#https://datatables.net/reference/option/
options(DT.options = list(scrollX = TRUE, pagin=TRUE, fixedHeader = TRUE, searchHighlight = TRUE))

Introduction

Check out this Kaggle

Business Goals

  1. Predict the number of bookings for the resort and city hotels for the next 4 weeks
  2. Classify a booking as a resort or city type
  3. Predict average daily rate (adr) prices based on other features

Ideas

  1. logistic regression / random forests / svm
  2. fb’s prophet package
  3. elastic net, random forest, regression

Get Data

a = read_csv('hotel_bookings.csv') %>%
  clean_names() %>% 
  mutate(across(where(is.character), factor)) %>% 
  select(sort(tidyselect::peek_vars())) %>% 
  select(
    where(is.Date),
    where(is.factor),
    where(is.numeric)
  ) %>% filter(is_canceled == 0) #filter to non-canceled bookings

a$is_canceled = NULL

Split Data

split = initial_split(a)
train = rsample::training(split)
test = rsample::testing(split)

Resources

  1. adr
  2. transient
  3. group rate
  4. Guide to Hotel Management
  5. distribution channels

5 min EDA

train %>% head

Not sure exactly what ‘reservation_status_date’ refers to; for a date field, will create an ‘arrival.date’ col to use in its place

skimr::skim(train)
Data summary
Name train
Number of rows 56375
Number of columns 31
_______________________
Column type frequency:
Date 1
factor 13
numeric 17
________________________
Group variables None

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
reservation_status_date 0 1 2015-07-01 2017-09-14 2016-09-03 805

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
agent 0 1 FALSE 301 9: 14077, NUL: 9223, 240: 6333, 7: 2285
arrival_date_month 0 1 FALSE 12 Aug: 6492, Jul: 5869, May: 5297, Oct: 5208
assigned_room_type 0 1 FALSE 10 A: 30827, D: 14290, E: 4365, F: 2096
company 0 1 FALSE 310 NUL: 52171, 40: 628, 223: 503, 45: 150
country 0 1 FALSE 156 PRT: 15759, GBR: 7226, FRA: 6425, ESP: 4768
customer_type 0 1 FALSE 4 Tra: 39832, Tra: 13983, Con: 2158, Gro: 402
deposit_type 0 1 FALSE 3 No : 56212, Ref: 93, Non: 70
distribution_channel 0 1 FALSE 5 TA/: 43311, Dir: 9038, Cor: 3917, GDS: 108
hotel 0 1 FALSE 2 Cit: 34665, Res: 21710
market_segment 0 1 FALSE 7 Onl: 26836, Off: 11933, Dir: 8002, Gro: 5749
meal 0 1 FALSE 5 BB: 43389, HB: 7108, SC: 4988, Und: 641
reservation_status 0 1 FALSE 1 Che: 56375, Can: 0, No-: 0
reserved_room_type 0 1 FALSE 9 A: 39242, D: 9900, E: 3484, F: 1489

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
adr 0 1 100.01 49.17 -6.38 67.5 92.4 125 510 <U+2587><U+2586><U+2581><U+2581><U+2581>
adults 0 1 1.83 0.51 0.00 2.0 2.0 2 4 <U+2581><U+2582><U+2587><U+2581><U+2581>
arrival_date_day_of_month 0 1 15.84 8.79 1.00 8.0 16.0 23 31 <U+2587><U+2587><U+2587><U+2587><U+2586>
arrival_date_week_number 0 1 27.11 13.91 1.00 16.0 28.0 38 53 <U+2586><U+2587><U+2587><U+2587><U+2586>
arrival_date_year 0 1 2016.15 0.70 2015.00 2016.0 2016.0 2017 2017 <U+2583><U+2581><U+2587><U+2581><U+2586>
babies 0 1 0.01 0.12 0.00 0.0 0.0 0 10 <U+2587><U+2581><U+2581><U+2581><U+2581>
booking_changes 0 1 0.30 0.74 0.00 0.0 0.0 0 21 <U+2587><U+2581><U+2581><U+2581><U+2581>
children 0 1 0.10 0.39 0.00 0.0 0.0 0 3 <U+2587><U+2581><U+2581><U+2581><U+2581>
days_in_waiting_list 0 1 1.65 15.12 0.00 0.0 0.0 0 379 <U+2587><U+2581><U+2581><U+2581><U+2581>
is_repeated_guest 0 1 0.04 0.20 0.00 0.0 0.0 0 1 <U+2587><U+2581><U+2581><U+2581><U+2581>
lead_time 0 1 80.23 91.28 0.00 9.0 45.0 125 709 <U+2587><U+2582><U+2581><U+2581><U+2581>
previous_bookings_not_canceled 0 1 0.20 1.82 0.00 0.0 0.0 0 72 <U+2587><U+2581><U+2581><U+2581><U+2581>
previous_cancellations 0 1 0.02 0.28 0.00 0.0 0.0 0 13 <U+2587><U+2581><U+2581><U+2581><U+2581>
required_car_parking_spaces 0 1 0.10 0.31 0.00 0.0 0.0 0 8 <U+2587><U+2581><U+2581><U+2581><U+2581>
stays_in_week_nights 0 1 2.46 1.92 0.00 1.0 2.0 3 42 <U+2587><U+2581><U+2581><U+2581><U+2581>
stays_in_weekend_nights 0 1 0.93 0.99 0.00 0.0 1.0 2 18 <U+2587><U+2581><U+2581><U+2581><U+2581>
total_of_special_requests 0 1 0.72 0.84 0.00 0.0 1.0 1 5 <U+2587><U+2581><U+2581><U+2581><U+2581>

clean/encode data

# make arrival date var
train = train %>% mutate(
  arrival.date = make_date(
    year = arrival_date_year,
    month = match(arrival_date_month, month.name),
    day = arrival_date_day_of_month)
  )

# these numeric vars s/b factor vars
train = train %>% mutate_at(vars(arrival_date_day_of_month, arrival_date_week_number, arrival_date_year, is_repeated_guest), factor)

# reordering df
train = train %>% select(sort(tidyselect::peek_vars())) %>% 
  select(
    where(is.Date),
    where(is.factor),
    where(is.numeric)
  )

You’ll have to repeat the steps above for the test data when preprocessing with recipes and step_XXX’

EDA: time series

range

paste(
  'The date range of this dataset is from',
  train %>% pull(arrival.date) %>% range %>% .[1],
  'to',
  train %>% pull(arrival.date) %>% range %>% .[2],
  ', just over 3 years of data.'
)
## [1] "The date range of this dataset is from 2015-07-01 to 2017-08-31 , just over 3 years of data."

time series graph – ungrouped

train %>% group_by(arrival.date) %>% 
  summarise(total.bookings = sum(adults, children)) %>% 
  arrange(arrival.date) %>%
  plot_ly(
    x = ~arrival.date,
    y = ~total.bookings
  ) %>% layout(
    title = 'total.bookings by date',
    xaxis = list(title = ''),
    yaxis = list(title = '')
    )

time series graph – grouped (hotel)

train %>% group_by(arrival.date, hotel) %>% 
  summarise(total.bookings = sum(adults, children)) %>% 
  arrange(arrival.date) %>%
  plot_ly(
    x = ~arrival.date,
    y = ~total.bookings,
    color = ~hotel,
    alphtrain = 0.7
  )  %>% layout(
    title = 'total.bookings by date/hotel',
    xaxis = list(title = ''),
    yaxis = list(title = '')
    ) 

time series graph – grouped (customer_type)

train %>% group_by(arrival.date, customer_type) %>% 
  summarise(total.bookings = sum(adults, children)) %>%  
  arrange(arrival.date) %>%
  plot_ly(
    x = ~arrival.date,
    y = ~total.bookings,
    color = ~customer_type,
    alphtrain = 0.7
  ) %>% layout(
    title = 'total.bookings by date/customer_type',
    xaxis = list(title = ''),
    yaxis = list(title = '')
    )

time series graph – grouped (deposit_type)

train %>% group_by(arrival.date, deposit_type) %>% 
  summarise(total.bookings = sum(adults, children)) %>%   
  arrange(arrival.date) %>%
  plot_ly(
    x = ~arrival.date,
    y = ~total.bookings,
    color = ~deposit_type,
    alphtrain = 0.7
  ) %>% layout(
    title = 'total.bookings by date/deposit_type',
    xaxis = list(title = ''),
    yaxis = list(title = '')
    )

time series graph – grouped (distribution_channel)

train %>% group_by(arrival.date, distribution_channel) %>% 
  summarise(total.bookings = sum(adults, children)) %>%  
  arrange(arrival.date) %>%
  plot_ly(
    x = ~arrival.date,
    y = ~total.bookings,
    color = ~distribution_channel,
    alphtrain = 0.7
  ) %>% layout(
    title = 'total.bookings by date/distribution_channel',
    xaxis = list(title = ''),
    yaxis = list(title = '')
    )

EDA: nom vars

sample data

train %>% select(where(is.factor)) %>% slice_sample(n = 10)

glimpse structure

train %>% select(where(is.factor)) %>% glimpse
## Rows: 56,375
## Columns: 17
## $ agent                     <fct> NULL, NULL, 240, 240, NULL, 303, 241, 241...
## $ arrival_date_day_of_month <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ arrival_date_month        <fct> July, July, July, July, July, July, July,...
## $ arrival_date_week_number  <fct> 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 2...
## $ arrival_date_year         <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015,...
## $ assigned_room_type        <fct> C, C, A, A, C, C, G, E, E, G, F, A, A, D,...
## $ company                   <fct> NULL, NULL, NULL, NULL, NULL, NULL, NULL,...
## $ country                   <fct> PRT, GBR, GBR, GBR, PRT, PRT, ESP, PRT, P...
## $ customer_type             <fct> Transient, Transient, Transient, Transien...
## $ deposit_type              <fct> No Deposit, No Deposit, No Deposit, No De...
## $ distribution_channel      <fct> Direct, Direct, TA/TO, TA/TO, Direct, Dir...
## $ hotel                     <fct> Resort Hotel, Resort Hotel, Resort Hotel,...
## $ is_repeated_guest         <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ market_segment            <fct> Direct, Direct, Online TA, Online TA, Dir...
## $ meal                      <fct> BB, BB, BB, BB, BB, FB, HB, BB, BB, BB, B...
## $ reservation_status        <fct> Check-Out, Check-Out, Check-Out, Check-Ou...
## $ reserved_room_type        <fct> C, A, A, A, C, C, G, E, E, G, F, A, A, D,...

check missing values

train %>% select(where(is.factor)) %>% miss_var_summary

distribution of level counts per factor

jpal = colorRampPalette(brewer.pal(8,'Dark2'))(15)

train %>% select(where(is.factor)) %>%
  map(n_unique) %>%
  as.tibble() %>%
  pivot_longer(everything()) %>%
  plot_ly(y = ~name, x = ~value, color = ~name, colors = jpal) %>%
  add_bars() %>%
  hide_legend() %>% 
  layout(
    title = 'distribution of level counts per factor',
    xaxis = list(title = ''),
    yaxis = list(title = '')
    )

reference: names of unique levels

train %>% select(where(is.factor)) %>%
  map(unique)
## $agent
##   [1] NULL 240  303  241  8    250  115  134  156  243  242  5    40   105  147 
##  [16] 306  184  175  96   2    127  95   6    244  15   167  300  305  196  152 
##  [31] 142  171  177  36   67   104  261  149  26   258  71   146  181  88   143 
##  [46] 251  275  69   256  314  110  126  281  208  253  330  328  326  334  185 
##  [61] 321  324  313  38   155  68   308  332  387  298  273  248  315  75   307 
##  [76] 436  201  183  223  94   3    16   468  446  327  34   78   385  339  139 
##  [91] 9    270  47   128  154  114  66   29   301  245  193  1    182  336  135 
## [106] 348  350  195  352  355  159  10   384  363  360  375  331  367  91   64  
## [121] 393  168  406  249  405  414  333  11   427  431  429  426  430  438  418 
## [136] 441  282  432  72   450  434  454  455  59   368  254  57   180  358  464 
## [151] 411  481  165  467  510  531  440  337  526  493  420  502  527  469  410 
## [166] 479  508  163  535  302  13   7    27   22   17   28   14   42   20   19  
## [181] 45   37   61   39   21   24   50   30   52   12   44   31   83   32   63  
## [196] 56   89   86   79   132  4    70   82   81   74   92   99   85   87   112 
## [211] 106  98   111  119  148  151  138  103  121  118  211  153  210  187  129 
## [226] 213  174  220  173  216  232  35   23   58   205  157  133  150  214  290 
## [241] 192  191  267  215  252  247  219  280  285  289  269  295  265  335  288 
## [256] 122  325  234  341  310  344  77   346  359  283  364  370  33   371  25  
## [271] 53   227  141  378  391  397  416  404  73   354  444  296  425  461  394 
## [286] 390  388  453  262  459  474  229  475  480  423  484  495  509  449 
## 334 Levels: 1 10 103 104 105 106 107 11 110 111 112 114 115 117 118 119 ... NULL
## 
## $arrival_date_day_of_month
##  [1] 1  2  3  4  5  6  7  8  9  10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30 31
## 31 Levels: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 ... 31
## 
## $arrival_date_month
##  [1] July      August    September October   November  December  January  
##  [8] February  March     April     May       June     
## 12 Levels: April August December February January July June March ... September
## 
## $arrival_date_week_number
##  [1] 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
## [26] 52 53 1  2  3  4  5  6  7  8  9  10 11 12 13 14 15 16 17 18 19 20 21 22 23
## [51] 24 25 26
## 53 Levels: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 ... 53
## 
## $arrival_date_year
## [1] 2015 2016 2017
## Levels: 2015 2016 2017
## 
## $assigned_room_type
##  [1] C A G E F D I B H K
## Levels: A B C D E F G H I K L P
## 
## $company
##   [1] NULL 110  113  270  240  154  144  307  268  59   312  318  274  174  195 
##  [16] 223  317  118  281  47   324  342  371  286  88   331  178  405  367  337 
##  [31] 20   94   53   528  62   120  42   82   81   116  530  103  112  135  9   
##  [46] 39   204  16   92   31   61   356  12   457  501  86   165  291  292  290 
##  [61] 192  108  51   43   34   224  388  269  465  287  297  490  482  207  169 
##  [76] 282  437  263  225  329  28   200  338  72   319  146  159  380  323  407 
##  [91] 421  325  80   403  399  14   84   343  346  347  349  289  351  355  353 
## [106] 99   250  358  361  390  362  366  365  277  109  377  379  22   378  83  
## [121] 364  360  401  232  511  384  167  212  514  391  400  376  392  396  302 
## [136] 370  397  369  409  251  168  104  382  408  413  148  10   333  419  415 
## [151] 424  425  423  428  422  395  435  439  445  448  443  454  444  394  52  
## [166] 459  458  456  460  447  470  255  466  484  184  485  32   487  491  494 
## [181] 193  516  499  308  29   78   254  504  130  520  507  498  515  512  126 
## [196] 64   330  242  477  518  521  523  539  534  436  525  541  40   455  45  
## [211] 38   49   67   68   8    46   76   96   115  105  93   101  11   91   137 
## [226] 139  142  127  107  140  143  149  150  163  85   180  238  219  179  221 
## [241] 183  153  197  203  185  217  209  215  213  186  237  216  230  234  227 
## [256] 246  245  218  158  258  259  260  411  272  257  271  18   275  210  273 
## [271] 278  71   284  301  233  305  293  264  311  304  313  288  320  334  314 
## [286] 332  341  352  350  372  73   383  368  393  220  412  420  410  426  417 
## [301] 243  429  433  446  450  418  280  357  479  478  483  489  486  481  497 
## [316] 451  492 
## 353 Levels: 10 100 101 102 103 104 105 106 107 108 109 11 110 112 113 ... NULL
## 
## $country
##   [1] PRT  GBR  ESP  IRL  FRA  NULL ROU  NOR  ARG  USA  POL  DEU  CHE  CN   GRC 
##  [16] ITA  DNK  RUS  SWE  EST  CZE  FIN  AUS  MOZ  BWA  NLD  ALB  BRA  BEL  IND 
##  [31] CHN  MAR  SVN  UKR  SMR  LVA  SRB  CHL  AUT  BLR  LTU  OMN  TUR  MEX  ISR 
##  [46] LUX  ZMB  CPV  ZWE  DZA  KOR  CRI  HUN  HRV  CAF  CYP  NZL  IDN  THA  DOM 
##  [61] NGA  GIB  ARM  LKA  CUB  COL  ZAF  CMR  IRN  BIH  MUS  COM  JAM  UGA  BGR 
##  [76] CIV  JOR  PRI  JPN  MYS  AGO  URY  SAU  SGP  KWT  LBN  AND  VNM  PLW  ARE 
##  [91] QAT  EGY  SVK  SUR  MDV  MLT  MWI  ISL  UZB  ECU  NPL  KAZ  BHS  PAK  MAC 
## [106] TGO  TWN  HKG  DJI  PHL  GEO  TUN  STP  VEN  AZE  SEN  PER  KNA  ETH  IRQ 
## [121] RWA  MMR  TMP  BFA  KEN  MCO  LBY  PAN  GNB  TZA  BGD  BHR  NAM  BOL  SYC 
## [136] BRB  AIA  SLV  PYF  LIE  ATA  GAB  MKD  MNE  GTM  GHA  ASM  PRY  SYR  ABW 
## [151] NCL  ATF  SLE  LAO  FRO 
## 178 Levels: ABW AGO AIA ALB AND ARE ARG ARM ASM ATA ATF AUS AUT AZE BDI ... ZWE
## 
## $customer_type
## [1] Transient       Contract        Transient-Party Group          
## Levels: Contract Group Transient Transient-Party
## 
## $deposit_type
## [1] No Deposit Refundable Non Refund
## Levels: No Deposit Non Refund Refundable
## 
## $distribution_channel
## [1] Direct    TA/TO     Corporate Undefined GDS      
## Levels: Corporate Direct GDS TA/TO Undefined
## 
## $hotel
## [1] Resort Hotel City Hotel  
## Levels: City Hotel Resort Hotel
## 
## $is_repeated_guest
## [1] 0 1
## Levels: 0 1
## 
## $market_segment
## [1] Direct        Online TA     Offline TA/TO Corporate     Complementary
## [6] Groups        Aviation     
## 8 Levels: Aviation Complementary Corporate Direct Groups ... Undefined
## 
## $meal
## [1] BB        FB        HB        SC        Undefined
## Levels: BB FB HB SC Undefined
## 
## $reservation_status
## [1] Check-Out
## Levels: Canceled Check-Out No-Show
## 
## $reserved_room_type
## [1] C A G E D F H L B
## Levels: A B C D E F G H L P
train = train %>% mutate(arrival_date_month = factor(arrival_date_month, levels = c('January','February','March','April','May','June','July','August','September','October','November','December')))

EDA: num vars

check missing values

train %>% select(where(is.numeric)) %>% miss_var_summary

sample data

train %>% select(where(is.numeric)) %>% slice_sample(n = 10)

glimpse structure

train %>% select(where(is.numeric)) %>% glimpse
## Rows: 56,375
## Columns: 13
## $ adr                            <dbl> 0.00, 0.00, 75.00, 98.00, 98.00, 107...
## $ adults                         <dbl> 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...
## $ babies                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ booking_changes                <dbl> 3, 4, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
## $ children                       <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
## $ days_in_waiting_list           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ lead_time                      <dbl> 342, 737, 7, 14, 14, 0, 9, 18, 37, 6...
## $ previous_bookings_not_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ previous_cancellations         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ required_car_parking_spaces    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ stays_in_week_nights           <dbl> 0, 0, 1, 2, 2, 2, 2, 4, 4, 4, 4, 1, ...
## $ stays_in_weekend_nights        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ total_of_special_requests      <dbl> 0, 0, 0, 1, 1, 0, 1, 1, 0, 3, 0, 0, ...

viz: outliers

train %>% select(where(is.numeric)) %>% dlookr::plot_outlier()

There are many upper outliers. When building a prediction model, perhaps we should consider transforming them or removing them entirely.

check right tailness

jquantiles = function(col){quantile(col, probs = c(0.90, 0.95, 0.99, 1))}

train %>% na.omit %>% select(where(is.numeric)) %>%
  map(.x = . , jquantiles) %>%
  as.data.frame.list() %>%
  rownames_to_column() %>%
  as.tibble()

viz: normality

train %>% select(where(is.numeric)) %>% dlookr::plot_normality()

viz: distribution histogram (with outliers)

#with outliers
train %>% select(where(is.numeric)) %>% DataExplorer::plot_histogram(nrow = 2, ncol = 1)

viz: distribution histogram (without most extreme outliers)

#no extreme right tailed outliers
train %>% select(where(is.numeric)) %>% filter(
  adr != 510,
  adults != 55,
  babies != 10,
  booking_changes != 21,
  children != 10,
  days_in_waiting_list != 391,
  lead_time != 709,
  previous_bookings_not_canceled != 72,
  previous_cancellations != 26,
  required_car_parking_spaces != 8,
  stays_in_week_nights != 50,
  stays_in_weekend_nights != 19
) %>% DataExplorer::plot_histogram(nrow = 2, ncol = 1)

viz: distribution bivariate (without extreme outliers)

#no outliers
train %>% select(hotel, where(is.numeric)) %>% filter(
  adr != 510,
  adults != 55,
  babies != 10,
  booking_changes != 21,
  children != 10,
  days_in_waiting_list != 391,
  lead_time != 709,
  previous_bookings_not_canceled != 72,
  previous_cancellations != 26,
  required_car_parking_spaces != 8,
  stays_in_week_nights != 50,
  stays_in_weekend_nights != 19
) %>% DataExplorer::plot_boxplot(by = 'hotel', nrow = 3, ncol = 1)

EDA: ADR further investigation

CAUTION: You should calculate ADR per adult since the ds is at the booking level not the individual level. Also you should remove bookings with children

#check: check number of bookings by combination of adult and children counts
train %>% count(adults, children)
train %>% count(adults, children) %>% filter(adults == 0) %>% summarise(total.bookings.without.adults = sum(n))
train %>% count(adults, children) %>% filter(children == 0) %>% summarise(total.bookings.without.children = sum(n))
train %>% count(adults, children) %>% filter(children == 0, adults == 0) %>% summarise(total.bookings.without.adults.and.children = sum(n))

There are very few bookings with children, but it’s good to filter out to get the most accurate results.’

percent of ds with only adult booking data

paste(
  scales::percent(
    nrow(train %>% filter(adults > 0, children == 0)) / nrow(train)
    ),
  'of all observations have 1 or more adults and zero children.'
)
## [1] "93% of all observations have 1 or more adults and zero children."
train.adult.adrs = train %>% filter(adults > 0, children == 0)

distribution ADR per child

train %>% filter(adults ==0, children >0) %>% mutate(adr.per.child = adr/children) %>% plot_ly(x = ~adr.per.child)  %>% add_boxplot()

create adr.per.adult var

#creating adr per adult var
train.adult.adrs = train.adult.adrs %>% mutate(adr.per.adult = adr/adults)

train.adult.adrs %>%
  plot_ly(y = ~hotel, x = ~adr.per.adult, color = ~hotel, colors = jpal[1:2]) %>% 
  add_boxplot() %>% 
  layout(
    title = 'ADR per Adult by Hotel Type',
    xaxis = list(title = ''),
    yaxis = list(title = '')
  )
#https://stackoverflow.com/questions/57300053/split-a-plotly-boxplot-x-axis-by-group
train.adult.adrs %>%
  plot_ly(y = ~hotel, x = ~adr.per.adult, color = ~customer_type, colors = jpal, group = ~customer_type) %>% 
  add_boxplot() %>% 
  layout(
    boxmode = 'group', #SUPER IMPORTANT
    title = 'ADR per Adult by Hotel/customer_type'
    ) 
#https://stackoverflow.com/questions/57300053/split-a-plotly-boxplot-x-axis-by-group
train.adult.adrs %>%
  plot_ly(y = ~hotel, x = ~adr, color = ~market_segment, colors = jpal, group = ~market_segment) %>% 
  add_boxplot() %>% 
  layout(
    boxmode = 'group', #SUPER IMPORTANT
    title = 'ADR per Adult by Hotel/market_segment'
    ) 
#https://stackoverflow.com/questions/57300053/split-a-plotly-boxplot-x-axis-by-group
train.adult.adrs %>%
  plot_ly(x = ~hotel, y = ~adr, color = ~arrival_date_month, colors = jpal, group = ~arrival_date_month) %>% 
  add_boxplot() %>% 
  layout(
    boxmode = 'group', #SUPER IMPORTANT
    title = 'ADR per Adult by Hotel/arrival_date_month',
    hoverformat = '.0f'
    ) 
library(DescTools)
## 
## Attaching package: 'DescTools'
## The following object is masked from 'package:data.table':
## 
##     %like%

correlations: viz

train %>% select(where(is.numeric)) %>% dlookr::plot_correlate()

train %>% select(where(is.numeric)) %>% GGally::ggcorr(palette = "RdBu", label = TRUE)

Goal 1: Predict Number of Bookings by Hotel Type

Anomaly Detection

library(anomalize)
# time_decompose(data, target, method = c("stl", "twitter"), frequency = "auto", trend = "auto", ..., merge = FALSE, message = TRUE)
# anomalize(data, target, method = c("iqr", "gesd"), alpha = 0.05, max_anoms = 0.2, verbose = FALSE)
# The alpha parameter adjusts the width of the critical values. By default, alpha = 0.05.
# Lower values are more conservative while higher values are less prone to incorrectly classifying "normal" observations.
# max_anoms: The maximum percent of anomalies permitted to be identified.

# The STL method uses the stl() function from the stats package. STL works very well in circumstances where a long term trend is present (which applies in this case; see trend component in the prophet graphs below'). 
  
#use full data set, filter to hotel type, arrange by date
a1 = a %>% mutate(
  arrival.date = make_date(
    year = arrival_date_year,
    month = match(arrival_date_month, month.name),
    day = arrival_date_day_of_month)
  )

(anomaly.hotel.resort = a1 %>% filter(hotel == 'Resort Hotel') %>% 
  group_by(arrival.date, hotel) %>% 
  summarise(total.bookings = sum(adults, children)) %>% 
  select(arrival.date, hotel, total.bookings) %>% 
  arrange(arrival.date) %>% as.tibble() %>% 
  time_decompose(total.bookings, method = 'stl', merge = TRUE) %>%
  anomalize(remainder, alpha = 0.15, method = 'gesd') %>% #increasing sensitivity to outliers
  time_recompose())
(anomaly.hotel.city = a1 %>% filter(hotel == 'City Hotel') %>% 
  group_by(arrival.date, hotel) %>% 
  summarise(total.bookings = sum(adults, children)) %>% 
  select(arrival.date, hotel, total.bookings) %>% 
  arrange(arrival.date) %>% as.tibble() %>% 
  time_decompose(total.bookings, method = 'stl', merge = TRUE) %>%
  anomalize(remainder, alpha = 0.15, method = 'gesd') %>% #increasing sensitivity to outliers
  time_recompose())
ggplotly(
  anomaly.hotel.resort %>% 
    plot_anomalies(
      ncol = 2,
      alpha_dots = 0.5,
      alpha_circles = 0.5,
      size_circles = 2,
      time_recomposed = TRUE,
      alpha_ribbon = 0.05
      ) + scale_y_continuous(labels = comma) +
    labs(x = '', y = 'total.bookings', title = 'resort hotel total.bookings')
  )
ggplotly(
  anomaly.hotel.city %>% 
    plot_anomalies(
      ncol = 2,
      alpha_dots = 0.5,
      alpha_circles = 0.5,
      size_circles = 2,
      time_recomposed = TRUE,
      alpha_ribbon = 0.05
      ) + scale_y_continuous(labels = comma) +
    labs(x = '', y = 'total.bookings', title = 'city hotel total.bookings')
  )

Predicting Next 2 Wks Total Bookings

Resort Hotel

library(prophet)
library(dygraphs)
# https://dygraphs.com/options.html

#renaming cols to prophet's col conventions
prophet.resort.df = anomaly.hotel.resort %>% select(ds = arrival.date, y = total.bookings)

#creating model
prophet.resort.mdl = prophet.resort.df %>% prophet()

#using model make future period df
prophet.resort.future.df = prophet.resort.mdl %>% make_future_dataframe(
  periods = 28, #next 4 wks
  freq = 'day',
  include_history = TRUE
  )

#make forecasts df
prophet.resort.forecast.df = prophet.resort.mdl %>% predict(prophet.resort.future.df)

prophet.resort.forecast.df %>% head %>% DT::datatable()
#plot forecast
prophet.resort.mdl %>% dyplot.prophet(
  prophet.resort.forecast.df,
  main = '<h style="color: black; font-size:18px;">Resort Hotel: Total Bookings 1 Month Prediction</h>'
  ) %>%
  dygraphs::dyOptions(
    colors = c('darkorange','blue'),
    pointSize = 2,
    )
#plot forecast components
prophet.resort.mdl %>% prophet_plot_components(prophet.resort.forecast.df)

City Hotel

library(prophet)
# https://dygraphs.com/options.html

#renaming cols to prophet's col conventions
prophet.city.df = anomaly.hotel.city %>% select(ds = arrival.date, y = total.bookings)

#creating model
prophet.city.mdl = prophet.city.df %>% prophet()
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
#using model make future period df
prophet.city.future.df = prophet.city.mdl %>% make_future_dataframe(
  periods = 28, #next 4 wks
  freq = 'day',
  include_history = TRUE
  )

#make forecasts df
prophet.city.forecast.df = prophet.city.mdl %>% predict(prophet.city.future.df)

prophet.city.forecast.df %>% head %>% DT::datatable()
#plot forecast
prophet.city.mdl %>% dyplot.prophet(
  prophet.city.forecast.df,
  main = '<h style="color: black; font-size:18px;">City Hotel: Total Bookings 1 Month Prediction</h>'
  ) %>%
  dygraphs::dyOptions(
    colors = c('darkgreen','blue'),
    pointSize = 2,
    )
#plot forecast components
prophet.city.mdl %>% prophet_plot_components(prophet.city.forecast.df)